import json
import spotipy
import pandas as pd
import io
import re
import networkx as nx
import matplotlib.pyplot as plt
import collections
from fa2 import ForceAtlas2
from networkx.readwrite import json_graph
import numpy as np
from matplotlib import colors
import operator
from community import community_louvain
import matplotlib.colors as pltcolors
import os

# Load the curated list of hip-hop artists; nameList is used later to filter
# featured artists down to the ones we track.
df_artists = pd.read_csv(r'hiphopArtists_new.csv')
nameList = list(df_artists["artist_name"])
df_artists.tail(10)

# BUG FIX: the original guard `if 'songs_with_feats_new3.json':` tested a
# non-empty string literal, which is ALWAYS truthy. Check that the file
# actually exists before trying to open it.
if os.path.exists('songs_with_feats_new3.json'):
    with open('songs_with_feats_new3.json', 'r') as f:
        songs = json.load(f)
In this step, we build a dictionary keyed by artist name, whose value is another dictionary mapping each collaborator's name to the number of songs the two artists collaborated on.
# Build {artist: {collaborator: song_count}} from the scraped track data.
# A collaboration is counted once per distinct track name, and only features
# that appear in our artist dataframe are kept.
total_songs = []  # NOTE(review): never populated in this cell; kept in case later cells use it
artist_tracks_data = songs["artists"]
artist_collab_dict = {}
# Hoisted set: O(1) membership test instead of an O(n) list scan per feature.
name_set = set(nameList)
for artist_name, artist_tracks in artist_tracks_data.items():
    # The inner dictionary: collaborator name -> number of shared songs.
    collabs = {}
    # Track names already counted, so duplicate entries are not double-counted.
    track_lookup = set()
    for track in artist_tracks["tracks"]:
        feats = track['feats']
        track_name = track['track_name']
        # Only count tracks that have collaborators and were not seen before.
        if feats and track_name not in track_lookup:
            track_lookup.add(track_name)
            for feat_artist in feats:
                # Only keep featured artists present in our dataframe.
                if feat_artist in name_set:
                    collabs[feat_artist] = collabs.get(feat_artist, 0) + 1
    if collabs:
        artist_collab_dict[artist_name] = collabs
artist_collab_dict["Drake"]
Here, we create the network. First, we add every node and give it a weight equal to its degree. Then we add edges: we iterate through every artist and all of their collaborators, adding an edge between each artist and each collaborator.
If we add an edge between artists A and B and see that an edge between B and A was already created, we update the weight of that edge to be the sum of the number of songs from A featuring B and the number of songs from B featuring A.
G_rap = nx.Graph()

# Directed pair -> song count, so the reverse direction can be folded into
# the undirected edge weight when we encounter it.
lookup_edges = {}

# Nodes: one per artist, weighted by their number of distinct collaborators.
for artist, collabs in artist_collab_dict.items():
    G_rap.add_node(artist, weight=len(collabs), collabs=set(collabs))

# Edges: start with the forward count; if the reverse pair was already seen,
# the edge weight becomes the sum of both directions.
for artist, collabs in artist_collab_dict.items():
    for collab, num_songs in collabs.items():
        lookup_edges[(artist, collab)] = num_songs
        num_songs_reverse = lookup_edges.get((collab, artist))
        if num_songs_reverse is None:
            G_rap.add_edge(artist, collab, weight=num_songs)
        else:
            G_rap.add_edge(artist, collab, weight=num_songs + num_songs_reverse)

print("Total number of nodes:", len(G_rap))
print("-----------")
print("Total number of links", G_rap.size())
print("-----------")
print("Density", nx.density(G_rap))
Getting the artists who have collaborated with others the most. Here is the list!
# Rank artists by their number of distinct collaborators (node degree).
degree_sequence = sorted([d for n, d in G_rap.degree()], reverse=True)
rap_sorted = sorted(G_rap.degree, key=lambda pair: pair[1], reverse=True)
print("- Top 10 by degree -")
for rank, (rapper, degree) in enumerate(rap_sorted[:10], start=1):
    print("#" + str(rank) + " :")
    print("Rapper: ", rapper)
    print("Total collaborators: ", degree)
    print('-----')
# Degree Distribution: how many collaborators a typical rapper has.
degrees = [G_rap.degree(n) for n in G_rap.nodes()]
plt.figure(figsize=(10, 8))
plt.hist(degrees, bins=20, edgecolor='black')
plt.xlabel('Number of collaborators')
plt.ylabel('Count')
plt.title('Degree Histogram of Rappers Network')
# Tick every 10 collaborators. range(10, 90, 10) replaces the original
# range(10, 90)[::10] slice, which produced the same values less directly.
plt.xticks(list(range(10, 90, 10)))
plt.show()
Here, we examine several measures of centrality to see which rappers are the most well-connected and influential.
Top Artists by Betweenness Centrality
# Betweenness Centrality: how often an artist sits on shortest paths
# between other artists in the collaboration graph.
bt_ctrs = list(nx.betweenness_centrality(G_rap).items())
sorted(bt_ctrs, key=lambda pair: pair[1], reverse=True)[:20]
Top Artists by Eigenvector Centrality
# Eigenvector Centrality: influence weighted by the influence of neighbors.
eig_ctrs = list(nx.eigenvector_centrality(G_rap).items())
sorted(eig_ctrs, key=lambda pair: pair[1], reverse=True)[:20]
# Degree vs Betweenness Centrality
deg_ctr_dict = nx.degree_centrality(G_rap)
bt_ctr_dict = nx.betweenness_centrality(G_rap)
eig_ctr_dict = nx.eigenvector_centrality(G_rap)
# dict preserves insertion order, so values() and the keyed lookup stay aligned.
x = list(deg_ctr_dict.values())
y = [bt_ctr_dict[artist] for artist in deg_ctr_dict]
plt.figure(figsize=(15, 10))
plt.scatter(x, y, alpha=0.5)
plt.title('Degree Centrality vs Betweenness Centrality')
plt.xlabel('Degree Centrality')
plt.ylabel('Betweenness Centrality')
plt.show()

# Degree vs Eigenvector Centrality
x = list(deg_ctr_dict.values())
y = [eig_ctr_dict[artist] for artist in deg_ctr_dict]
plt.figure(figsize=(15, 10))
plt.scatter(x, y, alpha=0.5)
plt.title('Degree Centrality vs Eigenvector Centrality')
plt.xlabel('Degree Centrality')
plt.ylabel('Eigenvector Centrality')
plt.show()
It's interesting to note how much better Degree Centrality aligns with Eigenvector Centrality than with Betweenness Centrality.
Eigenvector centrality measures a node's "influence" on a network, since it takes into account the degree of a node's neighbors in the metric (so a node with many high degree neighbors is given a high eigenvector centrality score).
Since the correlation is so linear, we can guess that a rapper's number of collaborators is a good measure of his influence. High degree rappers are collaborating with other high degree rappers, and similarly for low degree rappers. This makes sense -- as a high degree rapper is probably very well known in the industry and has the power to collaborate with other very popular artists. However, a low degree, up and coming rapper who doesn't have the same influence is likely to collaborate with someone also with low degree -- "within his league", so to speak
# Global clustering coefficient: average over all nodes' local clustering.
nx.average_clustering(G_rap)
# Degree assortativity: do high-degree rappers tend to link to other
# high-degree rappers (positive) or to low-degree ones (negative)?
nx.degree_assortativity_coefficient(G_rap)
Our network seems to be only slightly assortative and minimally clustered.
Here, we take a look at artists whose neighbors have the highest average degree. What this is likely to mean is that the artist has collaborated with very influential artists (as we see in the correlation between degree and eigenvector centrality). Looking at some of these artists' connections, we can see that they've collaborated with the influential figures mentioned in the degree/eigenvector centrality measures.
# Average Neighbor Degree: artists whose collaborators are themselves
# highly connected rise to the top of this ranking.
# (Removed a line of commented-out dead code from an earlier draft.)
sweg = nx.average_neighbor_degree(G_rap)
sorted(sweg.items(), key=operator.itemgetter(1), reverse=True)[:10]
def get_collaborators(artist):
    """Print *artist*'s name and return the list of their direct
    collaborators (neighbors in the G_rap collaboration graph)."""
    print(artist)
    # list(...) is the idiomatic way to materialize the neighbor iterator,
    # replacing the pass-through comprehension.
    return list(G_rap.neighbors(artist))

print(get_collaborators("T.R.U."))
print(get_collaborators("Nicole Bus"))
print(get_collaborators("euro"))
print(get_collaborators("Juvenile"))
A clique of artists is a set of artists such that each member in the clique has collaborated with every other member. Analyzing these can be of value, because they will show us which artists are working closely together, and why that is (same location, same style of rap, etc.).
# Investigation ongoing ...
We start off by using ForceAtlas to visualize the structure of our network, without labels for now.
#TODO: Commenting and cleaning this section
# Restrict the layout to the giant (largest connected) component.
giant = G_rap.subgraph(max(nx.connected_components(G_rap), key=len))
data = json_graph.node_link_data(giant)

# ForceAtlas2 layout, tuned for a readable hub-and-spoke picture.
forceatlas2 = ForceAtlas2(
    # Behavior alternatives
    outboundAttractionDistribution=False,  # Dissuade hubs
    linLogMode=False,                      # NOT IMPLEMENTED
    adjustSizes=False,                     # Prevent overlap (NOT IMPLEMENTED)
    edgeWeightInfluence=1.5,
    # Performance
    jitterTolerance=1.0,                   # Tolerance
    barnesHutOptimize=True,
    barnesHutTheta=1.2,
    multiThreaded=False,                   # NOT IMPLEMENTED
    # Tuning
    scalingRatio=0.5,
    strongGravityMode=False,
    gravity=1,
    # Log
    verbose=False)

positionsUN = forceatlas2.forceatlas2_networkx_layout(giant, pos=None, iterations=2000)

# Persist the layout so the expensive computation can be reused elsewhere.
with open('positionsNetwork.json', 'w') as outfile:
    json.dump(positionsUN, outfile)

# Shift every label slightly above its node.
labelPos = {node: (x, y + 2) for node, (x, y) in positionsUN.items()}
# Grey-scale colormap for edges: darker = more shared songs.
cmape = colors.LinearSegmentedColormap.from_list('custom blue',
                                                 [(0, (0.3, 0.3, 0.3)),
                                                  (1, (0, 0, 0))], N=5)
fig = plt.figure(figsize=(60, 60))

# Node size proportional to degree within the giant component.
degrees = []
for i in giant:
    degrees.append(giant.degree[i] * 6)

edges, weights = zip(*nx.get_edge_attributes(giant, 'weight').items())

# Map raw song counts onto edge line widths.
# BUG FIX: the original if/elif chain had no final else, so any edge with
# weight >= 25 appended nothing, leaving resWeights shorter than edges and
# desynchronizing widths from the edge list.
resWeights = []
for w in weights:
    if w < 5:
        resWeights.append(0.1)
    elif w < 10:
        resWeights.append(0.3)
    elif w < 15:
        resWeights.append(0.5)
    elif w < 20:
        resWeights.append(0.8)
    else:
        resWeights.append(1)

# BUG FIX: dropped `with_labels=False` — it is a kwarg of nx.draw(), not of
# draw_networkx_nodes(), and newer networkx rejects unknown keyword args.
a = nx.draw_networkx_nodes(giant, positionsUN, node_size=degrees,
                           node_color="blue", alpha=0.9)
# BUG FIX: the keyword is `edgelist`, not `edges_list`; the misspelled
# argument was silently ignored (or rejected by newer networkx versions).
b = nx.draw_networkx_edges(giant, positionsUN, edgelist=edges,
                           edge_color=weights, edge_cmap=cmape,
                           width=resWeights)
# Uncomment to render artist-name labels at the offset positions:
#c= nx.draw_networkx_labels(giant, labelPos,font_size=12)
plt.savefig('HipHop_US_Network_900.png')
Using the Louvain community detection algorithm, we'd like to see what communities exist in our network and extract some interesting insights — for example, whether rappers from the same cities (east coast vs. west coast) are working together.
# Flatten the giant component's weighted edges into a JSON-friendly list.
edgesWeight = dict(giant.edges)
edgesWeightList = []
for edge in edgesWeight:
    fromArtist, toArtist = edge
    weight = edgesWeight[edge]['weight']
    edgesWeightList.append({"from": fromArtist, "to": toArtist, "weight": weight})

with open('nodesDegree.json', 'w') as outfile:
    json.dump(list(giant.degree), outfile)

# Louvain community detection on the giant component.
partition = community_louvain.best_partition(giant)
communities = list(set(partition.values()))
# BUG FIX: the original bound this list to the name `colors`, shadowing the
# `matplotlib.colors` module imported at the top of the file.
palette = list(pltcolors._colors_full_map.values())[0:len(communities)]
cmap = dict(zip(communities, palette))
# BUG FIX: message typo — "has identifies" -> "has identified".
print("The algorithm has identified %.0f communities" % len(communities))
Here is a visualization of our network, colored by communities and also assigned labels
# Draw the network with one color per Louvain community, labels on top.
# (Removed the dead `count = 0.` initializer; enumerate supplies the counter.)
plt.figure(figsize=(100, 100))
pos = positionsUN
for count, com in enumerate(communities):
    # All nodes the partition assigned to this community.
    list_nodes = [node for node in partition
                  if partition[node] == com]
    # NOTE(review): node_size=degrees is sized for ALL giant nodes while
    # list_nodes is a subset — confirm networkx/matplotlib accept the
    # mismatched lengths here.
    nx.draw_networkx_nodes(giant, pos, list_nodes, node_size=degrees,
                           node_color=cmap.get(com), alpha=1)
nx.draw_networkx_edges(giant, pos, alpha=0.06)
nx.draw_networkx_labels(giant, pos, font_size=12)
plt.show()
# BUG FIX: iterating a dict yields its KEYS — here the artist-name strings —
# so the original `i[0]`/`i[1]` collected the first two CHARACTERS of each
# name instead of layout coordinates. Iterate the (x, y) position tuples.
posX = [coord[0] for coord in positionsUN.values()]
posY = [coord[1] for coord in positionsUN.values()]